# filesystem
library(here)

# data manipulation
library(magrittr)
library(tidyverse)
library(labelled)
library(janitor)

# text analysis
library(tidytext)
library(quanteda)
library(quanteda.textstats)
library(tm)
library(entropy)
library(philentropy)

# estimation
library(estimatr)

# reporting and viz
library(ggplot2)
library(ggpubr)
library(officer)
library(flextable)
library(extrafont)

loadfonts(device = "win")
theme_set(
  theme_bw() +
    theme(
      axis.text.y = element_text(lineheight = 1, hjust = 0, color = "black"),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.background = element_blank(),
      panel.border = element_blank(),
      # axis.line = element_blank(), #element_line(colour = "white"),
      axis.title.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.x = element_blank(),
      axis.title.y = element_blank(),
      axis.ticks.y = element_blank()
    )
)

# Helpers -----------------------------------------------------------------

adj_bhq <- function(tidy_coefs, alpha = 0.05) {
  # take output of `broom::tidy()`, i.e. coefficients
  # and perform Benjamini-Hochberg corrections for multiple hypothesis testing
  # (i.e. like Bonferonni correction, but less harsh)
  
  tidy_coefs <- tidy_coefs[order(tidy_coefs$p.value),] ##stacked broomed models
  k <- nrow(tidy_coefs) ##number of hypotheses 
  r <- 1:k ##ranks of p-values
  alpha_adj <- (r*alpha)/k ##step-up 'tailored' thresholds
  sig_BH <- tidy_coefs$p.value < alpha_adj ##step-up hypothesis tests 
  z_crit_adj <- qnorm(1 - (alpha_adj)/2) ##new critical values for CIs (asymptotic)
  
  tidy_coefs$conf.low <-
    with(tidy_coefs, estimate - z_crit_adj*std.error)
  tidy_coefs$conf.high <-
    with(tidy_coefs, estimate + z_crit_adj*std.error)
  tidy_coefs$stat.sig <- sig_BH
  
  return(tidy_coefs)
}

make_full_confusion_heatmap <- function(d) {
  p <- d %>%
    mutate(asked = fct_relevel(asked, "none_of_the_above", after = Inf),
           correct = fct_relevel(correct, "none_of_the_above", after = Inf)) %>%
    ggplot(aes(y=asked, x=correct, fill=pct)) +
    scale_fill_viridis_b(labels = scales::percent_format(1),
                         name = "Percent (Row-Wise):") +
    scale_x_discrete(labels = \(.) str_replace_all(., "_", " ") %>%
                       str_to_title(.) %>% 
                       str_remove_all(" Occupation") %>%
                       str_replace("Democratic / Biden Administration", "Dem. / Biden") %>%
                       gsub("((Abc|Nbc|Cbs|Tv|tv|Cnn|Npr))", "\\U\\1", ., perl= TRUE, ignore.case=T)) +
    scale_y_discrete(labels = \(.) str_replace_all(., "_", " ") %>%
                       str_to_title(.) %>%
                       str_remove_all(" Occupation") %>%
                       str_replace("Democratic / Biden Administration", "Dem. / Biden") %>%
                       gsub("((Abc|Nbc|Cbs|Tv|tv|Cnn|Npr))", "\\U\\1", ., perl= TRUE, ignore.case=T), 
                     limits = rev) +
    geom_tile(color = "grey") +
    # coord_fixed()
    geom_text(aes(label = sprintf("%0.0f%%", round(100*pct, 0)),
                  color = ifelse(pct < 0.5, "white", "black")), fontface = "bold") +
    scale_color_identity() +
    # theme_norc(gridlines = T) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 30, size=16, hjust = 1, lineheight = 1),
          strip.text = element_text(size=22, lineheight = 1),
          panel.grid.major = element_blank(),
          legend.position = "top",
          # panel.background = element_rect(fill = NA),
          # panel.ontop = TRUE,
          legend.title = element_text(size=18),
          axis.text = element_text(size=18, lineheight = 1),
          axis.title = element_text(size=20, lineheight = 1),
          plot.caption = element_text(size = 12, lineheight = 1))
  
  # hack for grid lines ... theme_norc() doesn't allow
  for (i in 1:length(unique(d$asked))) {
    p <- p +
      geom_vline(xintercept=i+.5, lwd=0.25, alpha=0.25) +
      geom_hline(yintercept=i+.5, lwd=0.25, alpha=0.25)
  }
  return(p)
}

make_category_correlation_plot <- function(d, xvar, yvar) {
  d.pct <- d %>% 
    filter(type %in% c(xvar, yvar)) %>%
    select(-count) %>%
    pivot_wider(names_from = type, values_from = pct, values_fill = 0) %>%
    rename_at(xvar, ~"xvar") %>%
    rename_at(yvar, ~"yvar")
  d.n <- d %>% 
    filter(type %in% c(xvar, yvar)) %>%
    select(-pct) %>%
    pivot_wider(names_from = type, values_from = count, values_fill = 0) %>%
    rename_at(xvar, ~"xvar") %>%
    rename_at(yvar, ~"yvar")
  chi.pval <- stats::chisq.test(x=d.n$yvar, p=d.pct$xvar)$p.value
  p <- d.pct %>%
    mutate(cat = cat %>% 
             str_replace_all("_", " ") %>%
             str_to_title() %>% 
             str_replace_all(" Occupations", "") %>%
             gsub("((Abc|Nbc|Cbs|Tv|tv|Cnn|Npr))", "\\U\\1", ., perl= TRUE, ignore.case=T)) %>%
    ggplot(aes(x=xvar, y=yvar)) +
    geom_point() +
    labs(caption = sprintf("Note: Pearson's correlation coefficient = %0.2f", 
                           cor(d.pct$xvar, d.pct$yvar))) +
    # labs(caption = sprintf("Note: Pearson's correlation coefficient = %0.2f, Chi-square p-value %s", 
    #                        cor(d.pct$xvar, d.pct$yvar),
    #                        ifelse(chi.pval < 0.01, "< 0.01", paste0("= ", round(chi.pval, 2))))) +
    geom_abline(intercept = 0, slope = 1, lty = 2 , alpha = 0.5) +
    scale_x_continuous(labels = scales::percent_format(1), 
                       name=xvar, limits = c(0, max(c(d.pct$xvar, d.pct$yvar)))) + 
    scale_y_continuous(labels = scales::percent_format(1), 
                       name=yvar, limits = c(0, max(c(d.pct$xvar, d.pct$yvar)))) + 
    theme_bw() +
    theme(axis.text = element_text(size = 12),
          strip.text = element_text(size=22),
          plot.title = element_text(size=22, face = "bold"),
          # panel.background = element_rect(fill = NA),
          # panel.ontop = TRUE,
          axis.title = element_text(size=16, lineheight = 1),
          plot.caption = element_text(size = 11, lineheight = 1))
  return(p)
}

tab.num <- 0
number_tab <- function(cap) {
  tab.num <<- tab.num + 1
  paste0("Table ", tab.num,". ",cap)
}

fig.num <- 0
number_fig <- function(cap) {
  fig.num <<- fig.num + 1
  paste0("Figure ", fig.num,". ",cap)
}

str_filter <- function(str, rgx) {
  str[!grepl(rgx, str, ignore.case=T)]
}

# Data --------------------------------------------------------------------
message("Reading data...")

if (!"control" %in% ls())
  control    <- readRDS(file = here("data_clean/control.rds"))
if (!"treat_conf" %in% ls())
  treat_conf <- readRDS(file = here("data_clean/treat_conf.rds"))
if (!"treat_elab" %in% ls())
  treat_elab <- readRDS(file = here("data_clean/treat_elab.rds"))
if (!"combined" %in% ls())
  combined   <- readRDS(file = here("data_clean/combined.rds"))

# Codings of quality (combined response, control + elab)
coded_qual <- readRDS(file = here("data_clean/coded_qual.rds"))
# Codings of quality (pre-probe response, elab)
coded_qual_pre <- readRDS(file = here("data_clean/coded_qual_pre.rds"))
# Codings of category
coded_cats <- readRDS(file = here("data_clean/coded_cats.rds"))

# Global Variables --------------------------------------------------------
message("Setting globals...")

global_stat_sig_level = 0.05
stat_sig_label = paste("p <", global_stat_sig_level)
stat_insig_label = paste("p >=", global_stat_sig_level)

exp_ordered_labels = list(
  "issue" = "Most Imp. Issue",
  "econd" = "Econ. Cond.",
  "econd_tone" = "Econ. Cond.\n(Tone)",
  "econd_reason" = "Econ. Cond.\n(Reason)",
  "news" = "Pref. News",
  "occu" = "Main Occu."
)

var_labels <- list(
  "work_status" = "Employed",
  "gender" = "Gender",
  "educ5" = "Educ.",
  "age5" = "Age",
  "device_type" = "Device",
  "hh_inc5" = "HH. Income"
)

coders = c("Wang","Oya","Martin")

# note: removing examples, b/c collapsing examples and specific criteria
quality_criteria_ = c("relevant","specific","explanatory",
                      "incomplete","incomprehensible","redundant")
quality_criteria_rev = c("incomplete","incomprehensible","redundant")
quality_criteria_rev_new <- list("incomplete"="complete",
                                 "incomprehensible"="comprehensible",
                                 "redundant"="concise")
quality_criteria <- c("relevant","specific","explanatory",
                      "complete","comprehensible","concise")

ux_vars = c("quality","formal_tone","ease",
            "frustration","satisfaction")

# given more time, would do this in a less stupid way
exp_ordered_labels_f = function(.x) {
  case_match(.x,
             "issue" ~ "Most Imp. Issue",
             "econd" ~ "Econ. Cond.",
             "econd_tone" ~ "Econ. Cond.\n(Tone)",
             "econd_reason" ~ "Econ. Cond.\n(Reason)",
             "news" ~ "Pref. News",
             "occu" ~ "Main Occu.") %>%
    factor(., levels = unname(unlist(exp_ordered_labels)))
}
# exp_ordered_labels_f <- \(.x) factor(exp_ordered_labels[[.x]], levels=unname(unlist(exp_ordered_labels)))
# exp_ordered_labels_f <- \(.w) purrr::map_vec(.w, ~factor(exp_ordered_labels[[.x]], levels=unname(unlist(exp_ordered_labels))))


# Prep --------------------------------------------------------------------
message("Prepping variables...")

combined$age <- ifelse(is.na(combined$age), combined$panel_age, combined$age)

## Imputation/Labeling ----
if ("Elaboration" %in% unique(combined$treatment)) {
  combined <- combined %>%
    mutate(treatment = fct_recode(treatment, "Elab/Rel."="Elaboration", "Conf."="Confirmation"))
}

combined <- combined %>% 
  mutate(gender = case_when(
    grepl("^male", gender, ignore.case=T) | grepl("^male", panel_gender, ignore.case=T) ~
      "Male",
    grepl("female", gender, ignore.case=T) | grepl("female", panel_gender, ignore.case=T) ~
      "Female",
    str_detect(gender_specify, "i am male") ~ "Male",
    TRUE ~ "Other/Refused"
  ) %>% factor(levels=c("Male","Female","Other/Refused")))

combined <- combined %>%
  mutate(hh_income = panel_hh_income) %>%
  mutate(
    hh_inc5 = case_when(
      hh_income %in% c("Less than $5,000", "$5,000 to $9,999", 
                       "$10,000 to $14,999", "$15,000 to $19,999", 
                       "$20,000 to $24,999", "$25,000 to $29,999", 
                       "$30,000 to $34,999", "$35,000 to $39,999") ~ "<$40k",
      hh_income %in% c("$40,000 to $44,999", "$45,000 to $49,999", 
                       "$50,000 to $54,999", "$55,000 to $59,999", 
                       "$60,000 to $64,999", "$65,000 to $69,999", 
                       "$70,000 to $74,999", "$75,000 to $79,999") ~ "$40-80k",
      hh_income %in% c("$80,000 to $84,999", "$85,000 to $89,999",
                       "$90,000 to $94,999", "$95,000 to $99,999") ~ "$80-100k",
      hh_income %in% c("$100,000 to $124,999", "$125,000 to $149,999") ~ "$100-150k", 
      hh_income %in% c("$150,000 to $174,999", "$175,000 to $199,999",
                       "$200,000 to $249,999", "$250,000 to $499,999", 
                       "$500,000 to $999,999", "$1 million +") ~ ">$150k",
      TRUE ~ "IDK/Refused"
    ) %>%
      factor(
        levels = c(
          "<$40k", "$40-80k", "$80-100k", "$100-150k", ">$150k", "IDK/Refused"
        )
      )
  )

combined <- combined %>%
  mutate(age = ifelse(is.na(age), panel_age, age)) %>%
  mutate(age5 = cut(
    age, 
    breaks=c(18,30,40,50,60,100),
    labels=c("18-29","30-39","40-49","50-59","60+"),
    right=F
  ) %>% forcats::fct_na_value_to_level(level = "Refused"))

combined <- combined %>% 
  mutate(educ5 = educ %>%
           forcats::fct_na_value_to_level(level = "IDK/Refused") %>%
           forcats::fct_collapse("Post-grad"="postgraduate study/professional degree",
                                 "Bachelor's"="bachelor’s degree",
                                 "Some college"="some college/associate’s degree",
                                 "HS or equiv."="high school graduate or equivalent",
                                 "Less than HS"="less than high school") %>%
           forcats::fct_relevel("Less than HS", "HS or equiv.", "Some college", "Bachelor's", "Post-grad", "IDK/Refused"))

combined <- combined %>%
  mutate(
    device_type = forcats::fct_relabel(device_type, str_to_sentence),
    work_status = forcats::fct_relabel(work_status, str_to_sentence) %>%
      forcats::fct_na_value_to_level(level = "IDK/Refused")
  )

## Completes ----

if ("complete" %in% colnames(combined)) {
  combined <- combined %>%
    rename(completed = complete) # avoid clash with 'complete' criterion
}

combined_complete <- combined %>% 
  filter(status=="complete")

## Flags ----
flags_on_l <- bind_rows(
  control %>%
    select(matches("on_for")) %>%
    summarise(across(everything(), ~sum(.x, na.rm=T))) %>%
    mutate(treat = "Control")
  ,
  treat_conf %>%
    select(matches("on_for")) %>%
    summarise(across(everything(), ~sum(.x, na.rm=T))) %>%
    mutate(treat = "Conf.")
  ,
  treat_elab %>%
    select(matches("on_for")) %>%
    summarise(across(everything(), ~sum(.x, na.rm=T))) %>%
    mutate(treat = "Elab/Rel.")
) %>% 
  select(treat, 
         matches("issue"), matches("econd"), 
         matches("news"), matches("occu")) %>%
  pivot_longer(-treat, 
               names_to = c("exp", "flag"),  names_pattern = "(.*)_demerit_on_for\\.(.*)", 
               values_to = "n_on", values_drop_na = TRUE) %>%
  arrange(exp, flag) %>%
  group_by(exp, flag) %>%
  # filter(length(unique(treat)) > 1) %>%
  ungroup()

### identify which flags can be compared across conditions
flags_on_w <- flags_on_l %>%
  pivot_wider(id_cols = c(exp, flag), 
              names_from = "treat", values_from = "n_on")

### identify which flags actually detected
flags_detected_l <- bind_rows(
  control %>%
    select(matches("flagged\\.")) %>%
    summarise(across(everything(), ~mean(.x, na.rm=T))) %>%
    mutate(treat = "Control")
  ,
  treat_conf %>%
    select(matches("flagged\\.")) %>%
    summarise(across(everything(), ~mean(.x, na.rm=T))) %>%
    mutate(treat = "Conf.")
  ,
  treat_elab %>%
    select(matches("flagged\\.")) %>%
    summarise(across(everything(), ~mean(.x, na.rm=T))) %>%
    mutate(treat = "Elab/Rel.")
) %>% 
  select(treat, 
         matches("issue"), matches("econd"), 
         matches("news"), matches("occu")) %>%
  pivot_longer(-treat, 
               names_to = c("resp", "flag"),  
               names_pattern = "(.*)_demerit_flagged\\.(.*)", 
               values_to = "rate_detected") %>%
  mutate(exp = gsub("(_\\d|_\\d_probe)$", "",resp))


## Codes ----
### categories
coded_cats_n <- coded_cats %>%
  # remove any instances where a coder coded multiple categories for a response
  distinct(treat, respondent_id, q, coder, .keep_all = TRUE) %>%
  group_by(treat, respondent_id, q) %>%
  summarise(n_cats = length(unique(cat)),
            .groups = "drop")

coded_cats_resolved <- coded_cats %>%
  # remove any instances where a coder coded multiple categories for a response
  distinct(treat, respondent_id, q, coder, .keep_all = TRUE) %>%
  group_by(treat, respondent_id, q) %>% 
  # picking agreement or a majority .. otherwise picking randomly from one of the coders
  summarise_at("cat", list(cat_maj=~names(sort(-table(.x)))[1], 
                           cats_uniq=~length(unique(.x)))) %>%
  ungroup() %>%
  arrange(respondent_id, q)

treat_conf <- treat_conf %>%
  left_join(
    coded_cats_resolved %>%
      select(-cats_uniq, -treat) %>%
      pivot_wider(names_from = q,
                  names_glue = "{q}_coded",
                  values_from = cat_maj),
    by = "respondent_id"
  )
    
### quality
coded_qual_resolved <- coded_qual %>%
  # combine specific and examples categories
  mutate(specific = ifelse(examples==1|specific==1, 1, 0)) %>%
  select(-examples) %>%
  group_by(treat, respondent_id, q) %>%
  # take average rating across coders
  summarise(across(all_of(quality_criteria_), 
                   ~mean(.x, na.rm=T)), .groups = "drop") %>%
  # reverse code
  mutate(across(all_of(quality_criteria_rev), 
                ~case_match(.x, 0 ~ 1, 1 ~ 0, .default = NA))) %>%
  rename_with(~ unlist(quality_criteria_rev_new[.x]) %||% .x, 
              names(quality_criteria_rev_new))

coded_qual_pre_resolved <- coded_qual_pre %>%
  # combine specific and examples categories
  mutate(specific = ifelse(examples==1|specific==1, 1, 0)) %>%
  select(-examples) %>%
  group_by(treat, respondent_id, q) %>%
  # take average rating across coders
  summarise(across(all_of(quality_criteria_), 
                   ~mean(.x, na.rm=T)), .groups = "drop") %>%
  # reverse code
  mutate(across(all_of(quality_criteria_rev), 
                ~case_match(.x, 0 ~ 1, 1 ~ 0, .default = NA))) %>%
  rename_with(~ unlist(quality_criteria_rev_new[.x]) %||% .x, 
              names(quality_criteria_rev_new))

## NLP ----

col_stem <- c(
  # "issue_1", "issue_2",
  "issue_combined",
  "issue_last",
  # "econd_1", "econd_2",
  "econd_combined",
  "econd_last",
  "econd_reason_1",
  # "news_1", "news_2",
  "news_combined",
  "news_last",
  # "occu_1", "occu_2",
  "occu_combined",
  "occu_last",
  # "why_frustrated_1", 
  # "why_frustrated_2"
  "why_frustrated_combined",
  "why_frustrated_last"
)


combined <- combined %>%
  mutate(issue_combined_response = case_when(
    is.na(issue_2_response) ~ issue_1_response,
    .default = paste0(issue_1_response,". ",issue_2_response)
  ), econd_combined_response = case_when(
    is.na(econd_2_response) ~ econd_1_response,
    .default = paste0(econd_1_response,". ",econd_2_response)
  ), news_combined_response = case_when(
    is.na(news_2_response) ~ news_1_response,
    .default = paste0(news_1_response,". ",news_2_response)
  ), occu_combined_response = case_when(
    is.na(occu_2_response) ~ occu_1_response,
    .default = paste0(occu_1_response,". ",occu_2_response)
  ), why_frustrated_combined_response = case_when(
    is.na(why_frustrated_2_response) ~ why_frustrated_1_response,
    .default = paste0(why_frustrated_1_response,". ",why_frustrated_2_response)
  )) %>%
  mutate(issue_last_response = case_when(
    is.na(issue_2_response) ~ issue_1_response,
    .default = issue_1_response
  ), econd_last_response = case_when(
    is.na(econd_2_response) ~ econd_1_response,
    .default = econd_1_response
  ), news_last_response = case_when(
    is.na(news_2_response) ~ news_1_response,
    .default = news_1_response
  ), occu_last_response = case_when(
    is.na(occu_2_response) ~ occu_1_response,
    .default = occu_1_response
  ), why_frustrated_last_response = case_when(
    is.na(why_frustrated_2_response) ~ why_frustrated_1_response,
    .default = why_frustrated_1_response
  ))

if (file.exists(here("data_clean", "nlp_measures.rds"))) {
  nlp_measures <- readRDS(here("data_clean", "nlp_measures.rds"))
} else {
  nlp_measures <- purrr::map_dfr(col_stem, function(var) {
    var_name  <- paste0(var, "_response")
    message(var_name)
    t0 <- Sys.time()
    
    # tokens
    var_toks <- combined %>% 
      select(treatment, respondent_id, !!sym(var_name)) %>%
      unnest_tokens(word, !!sym(var_name), drop = FALSE)
    
    # cleaned tokens
    var_toks_cleaned <- var_toks %>% 
      # remove standard stopwords
      anti_join(stop_words, by = "word") %>% 
      # drop NA cases 
      filter(!is.na(word)) %>% 
      # remove numbers?
      filter(is.na(as.numeric(word)))
    
    # word frequencies
    freq_SE <- var_toks_cleaned %>% 
      count(respondent_id, word, name = "count_SE") %>% 
      group_by(respondent_id) %>% 
      mutate(prob_SE = count_SE/sum(count_SE)) %>% 
      distinct()
    
    freq_KL <- var_toks_cleaned %>% 
      count(word, name="count_KL") %>% 
      mutate(prob_kl = count_KL/sum(count_KL)) %>% 
      distinct()
    
    ### lexical diversity:
    ### num_unique_words/num_total_words
    lex_div_results <- var_toks %>% 
      group_by(treatment, respondent_id, !!sym(var_name)) %>% 
      summarise(
        unique_words = n_distinct(word),
        total_words = n(),
        lexical_diversity = unique_words/total_words,
        .groups = "drop"
      ) %>% 
      distinct()
    
    ### KL-divergence:
    ### for each respondent, -sum_{i=1:n}(p(word_i)log_10(p(word_i)/q(word_i) 
    ### where p(word_i) is same as before and q(word_i) is the number of times 
    ### that word appears in the entire sample divided by the total number of 
    ### words across all responses in the sample. (I'd be interested in this, 
    ### since this captures how "unique" a respondent's response is relative to others)
    
    P <- freq_SE %>% 
      pivot_wider(id_cols=respondent_id, 
                  names_from="word", values_from = "prob_SE", 
                  names_prefix = "word_", values_fill = 0) %>% 
      clean_names() 
    
    Q <- freq_KL %>% 
      mutate(respondent_id = 1) %>% 
      distinct() %>% 
      select(-count_KL) %>% 
      pivot_wider(names_from="word", values_from = "prob_kl", 
                  names_prefix = "word_", values_fill = 0) %>% 
      clean_names()
    
    # Calculate KL divergence for each respondent
    kl_results <- P %>%
      rowwise() %>%
      mutate(
        kl_divergence = suppressMessages(philentropy::KL(
          as.matrix(rbind(c_across(starts_with("word_")), Q[1, -1])),
          unit = "log10"
        )
        )) %>% 
      select(respondent_id, kl_divergence)
    
    ### shannon entropy:
    ### for each respondent, -sum_{i=1:n}(p(word_i)log_2(p(word_i)) 
    ### where i denotes a unique word that appears, and p(word_i) is
    ### the number of times that word appears in the response divided 
    ### by the total length (in number of words) of the string. 
    ### (a more conservative measure of whether the respondent repeats themselves)
    
    shannon_entropy_results <- freq_SE %>% 
      summarise(shannon_entropy = entropy::entropy(prob_SE, unit = "log2"))
    
    
    ### flesch/flesch-kincaid reading scores:
    ### FK reading ease -- 206.835 - 1.015(tot_words/tot_sentences) - 84.6(tot_syllables/tot_words)
    ### FK grade level -- 0.39(tot_words/tot_sentences) + 11.8(tot_syllables/tot_words)
    
    fk_results <- combined %>% 
      select(respondent_id, !!sym(var_name)) %>%
      mutate(fk_reading = quanteda.textstats::textstat_readability(
        !!sym(var_name), measure = c("Flesch", "Flesch.Kincaid")
      )) %>%
      unnest(fk_reading) %>%
      janitor::clean_names() %>%
      select(respondent_id, f_reading_ease = flesch, fk_grade_level = flesch_kincaid)
    
    print(Sys.time()-t0)
    
    lex_div_results %>%
      select(respondent_id, treatment, unique_words, total_words, lexical_diversity) %>%
      full_join(kl_results, by = "respondent_id") %>%
      full_join(shannon_entropy_results, by = "respondent_id") %>%
      full_join(fk_results, by = "respondent_id") %>%
      mutate(response = var_name) %>%
      relocate(treatment, respondent_id, response)
  })
  
  nlp_measures <- filter(nlp_measures, !is.na(treatment))
  saveRDS(nlp_measures, file = here("data_clean", "nlp_measures.rds"))
}

nlp_measure_names <- list(
  "unique_words"      = "Unique Words",
  "total_words"       = "Total Words",
  "lexical_diversity" = "Lexical Diversity",
  "kl_divergence"     = "KL Divergence",
  "shannon_entropy"   = "Shannon Entropy",
  "f_reading_ease"    = "Fleisch Reading Ease",
  "fk_grade_level"    = "F-K Grade Level"
)

nlp_measures_ordered_labels_f <- function(.x) {
  case_match(.x,
             "unique_words"      ~ "Unique Words",
             "total_words"       ~ "Total Words",
             "lexical_diversity" ~ "Lexical Diversity",
             "kl_divergence"     ~ "KL Divergence",
             "shannon_entropy"   ~ "Shannon Entropy",
             "f_reading_ease"    ~ "Fleisch Reading Ease",
             "fk_grade_level"    ~ "F-K Grade Level") %>%
    factor(., levels = unname(unlist(nlp_measure_names)))
}

na_ <- "none_of_the_above"
sent_ <- c("positive", "negative")


## Confusion Matrices -----------------------------------------------------

full_conf_matrices <- lapply(
  c("issue","econd_reason","news","occu"),
  function (exp) {
    treat_conf %>%
      rename_at(paste0(exp,"_detect_correct"), ~"correct") %>%
      rename_at(paste0(exp,"_confirm"), ~"conf") %>%
      rename_at(paste0(exp,"_to_confirm"), ~"asked") %>%
      rename_at(paste0(exp,"_confirm_other"), ~"other") %>%
      filter(!is.na(correct)) %>%
      mutate(correct = case_when(
        conf == "yes" ~ asked,
        TRUE ~ other
      )) %>%
      count(asked, correct, name = "count") %T>% {
        with(., stopifnot(all(!is.na(count))))
      } %>%
      group_by(asked) %>%
      mutate(pct = count/sum(count)) %>%
      ungroup()
  }) %>% 
  setNames(c("issue","econd_reason","news","occu"))

## Category Distributions -------------------------------------------------

cat_distr <- lapply(
  c("issue","econd_reason","news","occu"),
  function(exp) {
    out <- bind_rows(
      treat_conf %>% 
        select(matches(paste0(exp, ".*target_detected"))) %>% 
        summarise(across(everything(), ~sum(.x, na.rm=T))) %>% 
        pivot_longer(everything(), 
                     names_to = "cat", 
                     values_to = "count") %>% 
        mutate(cat = gsub(paste0(exp,".*detected\\."), "", cat), 
               pct = count/sum(count),
               type = "Detected in Conf.\nCondition OE")
      ,
      full_conf_matrices[[exp]] %>%
        group_by(cat = asked) %>%
        summarise(count = sum(count)) %>%
        mutate(pct = count/sum(count),
               type = "Detected + Probed\nin Conf. Condition OE")
      ,
      full_conf_matrices[[exp]] %>%
        group_by(cat = correct) %>%
        summarise(count = sum(count)) %>%
        mutate(pct = count/sum(count),
               type = "Confirmed by Respondent\nin Conf. Condition OE")
      ,
      coded_cats %>%
        filter(q == exp) %>%
        count(q, cat, name = "count") %>%
        mutate(pct = count/sum(count),
               type = "Coded in Conf.\nCondition OE (Any)")
      ,
      coded_cats_resolved %>%
        filter(q == exp) %>%
        count(q, cat = cat_maj, name = "count") %>%
        mutate(pct = count/sum(count),
               type = "Coded in Conf.\nCondition OE (Majority)")
    ) %>% select(type, cat, pct, count)
    
    # if category distributions are available from other 
    # conditions, add them in
    d.c <- control %>% 
      select(matches(paste0(exp, ".*target_detected")))
    
    if (ncol(d.c) > 0) {
      out <- bind_rows(
        out,
        d.c %>%
          summarise(across(everything(), ~sum(.x, na.rm=T))) %>% 
          pivot_longer(everything(), 
                       names_to = "cat", 
                       values_to = "count") %>% 
          mutate(cat = gsub(paste0(exp,".*detected\\."), "", cat), 
                 pct = count/sum(count),
                 type = "Detected in Control Condition OE")
      )
    }
    d.c <- control %>% 
      select(matches(paste0(exp, "_response_mc")))
    
    if (ncol(d.c) > 0) {
      out <- bind_rows(
        out,
        d.c %>%
          rename_at(paste0(exp, "_response_mc"), ~"cat") %>%
          filter(!is.na(cat)) %>%
          group_by(cat) %>%
          summarise(count = n()) %>%
          mutate(pct = count/sum(count),
                 type = "Answered by Respondent\nin Control Condition CE")
      )
    }
    
    d.t <- treat_elab %>% 
      select(matches(paste0(exp, ".*2_target_detected"))) %>%
      select(-ends_with("n"))
    if (ncol(d.t) > 0) {
      out <- bind_rows(
        out,
        d.t %>%
          summarise(across(everything(), ~sum(.x, na.rm=T))) %>% 
          pivot_longer(everything(), 
                       names_to = "cat", 
                       values_to = "count") %>% 
          mutate(cat = gsub(paste0(exp,".*detected\\."), "", cat), 
                 pct = count/sum(count),
                 type = "Detected in Elab/Rel. Condition OE")
      )
    }
    
    return(out)
  }) %>% 
  setNames(c("issue","econd_reason","news","occu"))

cat_distr$econd_tone <- bind_rows(
  treat_conf %>%
    mutate(cat = case_when(
      econd_tone_1_targets_detected_n == 0 ~ "neither_positive_nor_negative",
      # when not one distinct tone detected, textbot in design 
      # is de facto treating it as negative
      econd_tone_1_targets_detected_n == 2 ~ "negative",
      econd_tone_1_target_detected.positive == 1 ~ "positive",
      econd_tone_1_target_detected.negative == 1 ~ "negative"
    )) %>%
    filter(econd_dropout_b4 == FALSE,
           !is.na(cat)) %>%
    group_by(cat) %>%
    count(name="count") %>%
    ungroup() %>%
    mutate(pct = count/sum(count),
           type = "Detected in Conf.\nCondition OE"),
  treat_conf %>%
    filter(econd_dropout_b4 == FALSE,
           !is.na(econd_tone_confirm_correct)) %>%
    group_by(cat = econd_tone_confirm_correct) %>%
    count(name="count") %>%
    ungroup() %>%
    mutate(pct = count/sum(count),
           type = "Confirmed by Respondent\nin Conf. Condition OE"),
  control %>%
    filter(econd_dropout_b4 == FALSE,
           !is.na(econd_tone)) %>%
    group_by(cat = econd_tone) %>%
    count(name="count") %>%
    ungroup() %>%
    mutate(pct = count/sum(count),
           type = "Answered by Respondent\nin Control Condition CE")
)

## Probe Triggering --------------------------------------------------------

common_elab_probe_lang <- "more information|elaborate|detail|the name|specific|specify|examples|interesting|tell me more|could you explain|how do you think|i understand that|can you tell"
common_not_elab_probe_lang <- "hmm|do you mean that.*or just not|clarify|i'm not sure i understand|misunderstand"

common_rel_probe_lang <- "apologize for the confusion|sorry|clarify|lost me|hmm|do you mean|i didn't quite|i'm not sure i understand|i don't quite understand|i'm sorry, could you please clarify|i didn't understand|spell check|are you saying"
common_not_rel_probe_lang <- ""

cmb_rgx <- function(x, y) paste0(x,"|",y)

## Issue
combined <- combined %>% 
  mutate(issue_elab_probe_flag = #!is.na(issue_2_probe) & 
           !(str_detect(issue_2_probe, regex(common_not_elab_probe_lang, ignore_case = TRUE))) &
           str_detect(issue_2_probe, regex(common_elab_probe_lang, ignore_case = TRUE))) %>% 
  mutate(issue_rel_probe_flag = #!is.na(issue_2_probe) & 
           str_detect(issue_2_probe, regex(common_rel_probe_lang, ignore_case = TRUE)))

## Econ Conditions
combined <- combined %>%
  mutate(econd_elab_probe_flag = #!is.na(econd_2_probe) & 
           !(str_detect(econd_2_probe, regex(common_not_elab_probe_lang, ignore_case = TRUE))) &
           str_detect(econd_2_probe, regex(common_elab_probe_lang, ignore_case = TRUE))) %>% 
  mutate(econd_rel_probe_flag = #!is.na(econd_2_probe) & 
           str_detect(econd_2_probe, regex(cmb_rgx(common_rel_probe_lang, 
                                                   "conducting a market research|rather than|provide your thoughts on|i was asking about"), ignore_case = TRUE)))

## News
combined <- combined %>%
  mutate(news_elab_probe_flag = #!is.na(news_2_probe) & 
           !(str_detect(news_2_probe, regex(common_not_elab_probe_lang, ignore_case = TRUE))) &
           str_detect(news_2_probe, regex(cmb_rgx(common_elab_probe_lang, 
                                                  "which|the most|primary source|when it comes to|do you also rely on|do you tend|what do you like about"), ignore_case = TRUE))) %>% 
  mutate(news_rel_probe_flag = #!is.na(news_2_probe) & 
           str_detect(news_2_probe, regex(cmb_rgx(common_rel_probe_lang, 
                                                  "important to|that's okay"), ignore_case = TRUE)))

## Occupation
combined <- combined %>%
  mutate(occu_elab_probe_flag = #!is.na(occu_2_probe) & 
           !(str_detect(occu_2_probe, regex(common_not_elab_probe_lang, ignore_case = TRUE))) &
           str_detect(occu_2_probe, regex(cmb_rgx(common_elab_probe_lang, "entails|typical day|daily|kind of restaurant|subject|responsibilities|describe"), ignore_case = TRUE))) %>% 
  mutate(occu_rel_probe_flag = #!is.na(occu_2_probe) & 
           str_detect(occu_2_probe, regex(common_rel_probe_lang, ignore_case = TRUE)))


if (FALSE) { # Het FX template
  purrr::map_dfr(c("work_status","gender","educ5","age5","device_type","hh_inc5"),
                 function(var) {
                   # message(" ",var)
                   purrr::map_dfr(unique(sort(combined[[var]])) |>
                                    na.omit() |>
                                    str_filter("(Other|Refused|IDK)"),
                                  function(val) {
                                    combined %>%
                                      filter_at(var, ~.x == val)
                                  })
                 })
}

message("READY.")